{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 思考？如何提升成绩"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TFIDF  词的文本  字    那么 词 + 字的特征   逻辑回归\n",
    "## 逻辑回归  LightGBM Xgboost SVM "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('./data/train_set.csv')\n",
    "test = pd.read_csv('./data/test_set.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>article</th>\n",
       "      <th>word_seg</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>7368 1252069 365865 755561 1044285 129532 1053...</td>\n",
       "      <td>816903 597526 520477 1179558 1033823 758724 63...</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>581131 165432 7368 957317 1197553 570900 33659...</td>\n",
       "      <td>90540 816903 441039 816903 569138 816903 10343...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7368 87936 40494 490286 856005 641588 145611 1...</td>\n",
       "      <td>816903 1012629 957974 1033823 328210 947200 65...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>299237 760651 299237 887082 159592 556634 7489...</td>\n",
       "      <td>563568 1239563 680125 780219 782805 1033823 19...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>7368 7368 7368 865510 7368 396966 995243 37685...</td>\n",
       "      <td>816903 816903 816903 139132 816903 312320 1103...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                            article  \\\n",
       "0   0  7368 1252069 365865 755561 1044285 129532 1053...   \n",
       "1   1  581131 165432 7368 957317 1197553 570900 33659...   \n",
       "2   2  7368 87936 40494 490286 856005 641588 145611 1...   \n",
       "3   3  299237 760651 299237 887082 159592 556634 7489...   \n",
       "4   4  7368 7368 7368 865510 7368 396966 995243 37685...   \n",
       "\n",
       "                                            word_seg  class  \n",
       "0  816903 597526 520477 1179558 1033823 758724 63...     14  \n",
       "1  90540 816903 441039 816903 569138 816903 10343...      3  \n",
       "2  816903 1012629 957974 1033823 328210 947200 65...     12  \n",
       "3  563568 1239563 680125 780219 782805 1033823 19...     13  \n",
       "4  816903 816903 816903 139132 816903 312320 1103...     12  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TFIDF构建文本特征"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 词的TFIDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vec = TfidfVectorizer(analyzer='word',\n",
    "            ngram_range=(1,2),\n",
    "            min_df=3, \n",
    "            max_df=0.9,\n",
    "            use_idf=True,\n",
    "            smooth_idf=True, \n",
    "            sublinear_tf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_term_doc = word_vec.fit_transform(train['word_seg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_term_doc = word_vec.transform(test['word_seg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 字的TFIDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "char_vec = TfidfVectorizer(analyzer='word', #char\"我们来到深度之眼\"\n",
    "            ngram_range=(1,2), #(1,3)\n",
    "            min_df=3, \n",
    "            max_df=0.9,\n",
    "            use_idf=True,\n",
    "            smooth_idf=True, \n",
    "            sublinear_tf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_term_char = char_vec.fit_transform(train['article'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_term_char = char_vec.transform(test['article'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<102277x2820641 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 75739932 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#词\n",
    "train_term_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<102277x1241880 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 112400674 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#字\n",
    "train_term_char #[[0],[1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.sparse import hstack\n",
    "train_term_doc = hstack([train_term_doc, train_term_char])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<102277x4062521 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 188140606 stored elements in COOrdinate format>"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#102277x4062521 \n",
    "train_term_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_term_doc = hstack([test_term_doc, test_term_char])\n",
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<102277x4062521 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 185703775 stored elements in COOrdinate format>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#102277x4062521 \n",
    "test_term_doc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(<102277x4062521 sparse matrix of type '<class 'numpy.float64'>'\n",
       " \twith 188140606 stored elements in COOrdinate format>,\n",
       " <102277x4062521 sparse matrix of type '<class 'numpy.float64'>'\n",
       " \twith 185703775 stored elements in COOrdinate format>)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_term_doc,test_term_doc"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 逻辑回归构建模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "正确的操作应该是这样的："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "lb = LabelEncoder()\n",
    "train['label'] = lb.fit_transform(train['class'].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hcq/miniconda3/envs/python3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "/home/hcq/miniconda3/envs/python3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
      "  \"this warning.\", FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "LogisticRegression(C=4, class_weight=None, dual=True, fit_intercept=True,\n",
       "          intercept_scaling=1, max_iter=100, multi_class='warn',\n",
       "          n_jobs=None, penalty='l2', random_state=None, solver='warn',\n",
       "          tol=0.0001, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "clf = LogisticRegression(C=4, dual=True)\n",
    "clf.fit(train_term_doc,train['label'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_prob = clf.predict_proba(test_term_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 结果提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred = np.argmax(test_prob,axis=1)\n",
    "test['class'] = lb.inverse_transform(test_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "test[[\"id\",\"class\"]].to_csv(\"submission_baseline_word_char.csv\",index=False,header=True,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
